About

The Scripts

1) The Main Control File

This file loads all pre-requisites and defines characteristics.

  • No need to change anything in this file.

Set working directory

This ensures the script will read files from a specific location or write files to a specific location.

It should contain sub-folders for data, scripts, and output.

setwd("C:/Users/elizabeth.finn/OneDrive - Greater Manchester Combined Authority/SIPHER/syntheticPopulation/scripts/workshopVersions")

Clean workspace

rm(list = ls())   # remove all objects from work space 
gc(full = TRUE)   # deep clean garbage
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 539056 28.8    1196954   64   686411 36.7
## Vcells 986041  7.6    8388608   64  1876649 14.4
dir()             # check R has picked up relative file paths 
##  [1] "01_main.R"               "02_data_table.R"        
##  [3] "03_load_link.R"          "04_example_1.R"         
##  [5] "05_example_2.R"          "RData"                  
##  [7] "ROutput"                 "syntheticPopulation_v01"
##  [9] "v01.html"                "v01.Rmd"

Load relevant libraries

This list required packages and install them if required.

# List of required packages 
RequiredPackages <- c("data.table",  # for dialect 
                      "ggplot2",     # for plotting
                      "sf", "scales") # for maps

# Ensure all packages are installed and loaded 
.EnsurePackages <- function(packages_vector) {
  new_package <- packages_vector[!(packages_vector %in% 
                                     installed.packages()[, "Package"])]
  if (length(new_package) != 0) {
    install.packages(new_package) }
  sapply(packages_vector, suppressPackageStartupMessages(require),
         character.only = TRUE)
}
.EnsurePackages(RequiredPackages)
## Loading required package: data.table
## Loading required package: ggplot2
## Loading required package: sf
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
## Loading required package: scales
## data.table    ggplot2         sf     scales 
##       TRUE       TRUE       TRUE       TRUE

Definitions

Sets out key definitions of variables that will be used later in the scripts.

  • Important to note: This is where you can change the area you are interested in within the Synthetic Population. Can change it from England (‘E’) to Wales (‘W’), for example.
# Initialize definitions ...
definitions <- list()

# Start benchmarking
definitions$run_start <- Sys.time()

# Specify survey wave and country subset 
definitions$us_wave          <- c("k")   # "k" - US wave, do not change for now
definitions$geo_subset       <- c("E")   # "W" - for Wales (or "E" for England)
# might take a bit longer for England or England and Wales combined 

# Configure visuals  
definitions$fig_width  <- 25   # = width of output figures in cm
definitions$fig_height <- 15   # = height of output figures in cm
definitions$fig_unit   <- "cm" # = height of output figures in cm
definitions$map_cols   <- c("seagreen","lightgrey","salmon")

Finish benchmark and display duration

This is mainly to state how efficiently and quickly the code ran.

definitions$run_end <- Sys.time()            # end the benchmarking process 
definitions$run_end - definitions$run_start  # time it took to run
## Time difference of 0.02154398 secs

2) Dataset loading and merging

This demonstrates how to quickly load the Synthetic Population, Understanding Society (US) survey, and geo lookup.

  • This section also demonstrates how data can be reduced quickly and merged to fully set up the Synthetic Population.

Load in the Synthetic Population two column file

# load sp # 
sp_8cons <- data.table::fread("RData/sp_ind_wavek_census2011_est2020_8cons.csv")

# explore 
str(sp_8cons)      # general structure 
## Classes 'data.table' and 'data.frame':   52853971 obs. of  2 variables:
##  $ ZoneID: chr  "E01000001" "E01000001" "E01000001" "E01000001" ...
##  $ pidp  : num  5.44e+08 9.57e+08 1.57e+09 4.77e+08 5.45e+08 ...
##  - attr(*, ".internal.selfref")=<externalptr>
sp_8cons[1:5,]     # first 5 rows of all columns 
#  subset for geography of interest defined previously -> definitions$geo_subset
sp_8cons[, ctr_code := substr(ZoneID, 1, 1)] # first letter of Zone ID
sp_8cons <- sp_8cons[ctr_code %in% definitions$geo_subset, ]


Load Understanding Society (US) data for individuals and merge with Synth Pop

Load US data for individuals

us_ind <- data.table::fread("RData/k_indresp.tab")
dim(us_ind)      # general structure
## [1] 32008  3258
us_ind[1:5,1:5]  # first 5 rows of the first 5 variables


Rename: get rid of start with “k_”

data.table::setnames(us_ind, old = names(us_ind),
              new = gsub(paste0("^",definitions$us_wave,"_"),"",names(us_ind)))
us_ind[1:5,1:5]  # first 5 rows of the first 5 variables


Select variables to keep

us_ind_keep_var <- c("pidp", "hidp",  # id for individuals
                     "sex", "age_dv", # age and sex
                     "sf12mcs_dv",    # mental health indicator SF-12
                     "employ")        # employment
# apply selection 
us_ind <- us_ind[, ..us_ind_keep_var]


Merge Synth Pop with US data for individuals

sp_8cons <- merge(sp_8cons, us_ind, by = "pidp", all.x = TRUE) 
data.table::setkey(sp_8cons, ZoneID)
sp_8cons[1:5,]     # check result first 10 rows of all columns 

Load US raw data for households and merge with SynthPop

Load US data for households

us_hh <- data.table::fread("RData/k_hhresp.tab")
dim(us_hh)       # general structure
## [1] 18139   319
us_hh[1:5,1:5]   # first 5 rows of the first 5 variables
# rename: let's get rid of start with "k_" # 
data.table::setnames(us_hh, old = names(us_hh),
                     new = gsub(paste0("^",definitions$us_wave,"_"),"",names(us_hh)))
us_hh[1:5,1:5]  # first 5 rows of the first 5 variables


Select variables to keep

# us_hh_keep_var <- c("hidp",    # id for hh to link with individuals 
#                     "xphsdct") # financial hardship: problems paying council tax 

us_hh_keep_var <- c("hidp",    # id for hh to link with individuals 
                    "fihhmngrs_dv",
                    "xphsdct") # gross household income: month before interview

This is where you can also adapt the script for your own interests/purposes.

Apply selection of variables

us_hh <- us_hh[, ..us_hh_keep_var]

Merge Synth Pop with US data for households

# merge synthetic population with US data for households  
sp_8cons <- merge(sp_8cons, us_hh, by = "hidp", all.x = TRUE)  

# almost there! 
sp_8cons

Load the geographical lookup and merge this with the Synth Pop

Load Geo Lookup

A geo-lookup is a file which describes the hierarchy of geographies.

  • e.g., LSOA’s/DZ –> MSOA/IZ –> LA –> GOR –> Countries –> UK

  • A source of these files is can be found here.

lookup_oa <- data.table::fread("RData/OA_to_Local_Authority_District_May_2021.csv")
str(lookup_oa)
## Classes 'data.table' and 'data.frame':   2661131 obs. of  14 variables:
##  $ pcd7    : chr  "AB1 0AA" "AB1 0AB" "AB1 0AD" "AB1 0AE" ...
##  $ pcd8    : chr  "AB1  0AA" "AB1  0AB" "AB1  0AD" "AB1  0AE" ...
##  $ pcds    : chr  "AB1 0AA" "AB1 0AB" "AB1 0AD" "AB1 0AE" ...
##  $ dointr  : int  198001 198001 198001 199402 199012 199012 198001 198001 198001 198001 ...
##  $ doterm  : int  199606 199606 199606 199606 199207 199207 199606 199606 199606 199606 ...
##  $ usertype: int  0 0 0 0 1 1 0 0 1 0 ...
##  $ oa11cd  : chr  "S00090303" "S00090303" "S00090399" "S00091322" ...
##  $ lsoa11cd: chr  "S01006514" "S01006514" "S01006514" "S01006853" ...
##  $ msoa11cd: chr  "S02001237" "S02001237" "S02001237" "S02001296" ...
##  $ ladcd   : chr  "S12000033" "S12000033" "S12000033" "S12000034" ...
##  $ lsoa11nm: chr  "Cults, Bieldside and Milltimber West - 02" "Cults, Bieldside and Milltimber West - 02" "Cults, Bieldside and Milltimber West - 02" "Dunecht, Durris and Drumoak - 01" ...
##  $ msoa11nm: chr  "Cults, Bieldside and Milltimber Wes" "Cults, Bieldside and Milltimber Wes" "Cults, Bieldside and Milltimber Wes" "Dunecht, Durris and Drumoak" ...
##  $ ladnm   : chr  "Aberdeen City" "Aberdeen City" "Aberdeen City" "Aberdeenshire" ...
##  $ ladnmw  : chr  "" "" "" "" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Reduce look-up

lookup_oa <- unique(lookup_oa[, .(lsoa11cd, msoa11cd, ladcd, ladnm)])
str(lookup_oa)
## Classes 'data.table' and 'data.frame':   42632 obs. of  4 variables:
##  $ lsoa11cd: chr  "S01006514" "S01006853" "S01006511" "S01006506" ...
##  $ msoa11cd: chr  "S02001237" "S02001296" "S02001236" "S02001236" ...
##  $ ladcd   : chr  "S12000033" "S12000034" "S12000033" "S12000033" ...
##  $ ladnm   : chr  "Aberdeen City" "Aberdeenshire" "Aberdeen City" "Aberdeen City" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Merge the synthetic population with the reduced look-up file

sp_8cons <- merge(sp_8cons, lookup_oa, by.x = "ZoneID", by.y = "lsoa11cd",
                  all.x = TRUE)  

Sort columns and check the final dataset you are working with

Check dataset before reordering

sp_8cons[1:5,]

Sort columns by order geography levels and IDs

data.table::setcolorder(sp_8cons, neworder = c("ZoneID", "msoa11cd", "ladcd",
                                        "ladnm", "ctr_code",  "pidp", "hidp"))

Look at the data

sp_8cons[1:5,]
str(sp_8cons)
## Classes 'data.table' and 'data.frame':   45697898 obs. of  13 variables:
##  $ ZoneID      : chr  "E01000001" "E01000001" "E01000001" "E01000001" ...
##  $ msoa11cd    : chr  "E02000001" "E02000001" "E02000001" "E02000001" ...
##  $ ladcd       : chr  "E09000001" "E09000001" "E09000001" "E09000001" ...
##  $ ladnm       : chr  "City of London" "City of London" "City of London" "City of London" ...
##  $ ctr_code    : chr  "E" "E" "E" "E" ...
##  $ pidp        : int  68029931 68538656 68155063 68197887 68202647 68231211 69013378 68278127 68336615 68361771 ...
##  $ hidp        : int  68102020 68455620 68537220 68741220 68768420 68904420 68938420 69020020 69217220 69353220 ...
##  $ sex         : int  1 1 1 2 2 1 1 2 1 2 ...
##  $ age_dv      : int  50 32 26 58 44 52 70 72 31 43 ...
##  $ sf12mcs_dv  : num  54.2 52.1 54.1 57.3 40.8 ...
##  $ employ      : int  1 1 1 1 1 1 2 2 1 1 ...
##  $ fihhmngrs_dv: num  4352 4084 6896 1450 9204 ...
##  $ xphsdct     : int  2 2 2 2 2 1 2 2 2 2 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "ZoneID"

3) Example use

This example will allow the investigation of absolute numbers and prevalence of financial hardship across different types of areas.

For example, how many individuals (or % of individuals) experience financial problems within a certain area?

  1. Raw counts of individuals at LSOA level (simplest approach).

  2. Prevalence of individuals at Local Authority level (more complicated).

1) Raw Counts of Individuals at LSOA level

“Problems paying council tax” will be the variable used to represent financial hardship.

  • By searching here, you can see that we need the xphsdct ==1 to represent this.

Aggregation of a subset of this binary variable

# basic idea: an aggregation of a subset
sp_8cons[xphsdct == 1, ]
# ... an aggregation of a subset returning the number of observations
sp_8cons[xphsdct == 1, .(N_xphsdct = .N)]
# ... an aggregation of a subset returning the number of observations for DZs
sp_8cons[xphsdct == 1,  .(N_xphsdct = .N), by = c("ZoneID") ]
# ... assigned to a new object 
lsoa_hardship <- sp_8cons[xphsdct == 1,  .(N_xphsdct = .N), by = c("ZoneID") ]
lsoa_hardship
# create and display plot 
lsoa_hardship_plot <- ggplot2::ggplot(lsoa_hardship, aes(x = N_xphsdct)) + 
  geom_bar(width = 25)
  lsoa_hardship_plot
## Warning: `position_stack()` requires non-overlapping x intervals.

Aggregation of a subset of a continuous variable

Will use the “fihhmngrs_dv” continuous variable

  • This variable represents gross household income: month before interview.
sp_8cons[fihhmngrs_dv > 0, ]
# Aggregation of a subset returning the number of observations based on 'fihhmngrs_dv'
sp_8cons[fihhmngrs_dv > 0, .(N_fihhmngrs_dv = .N)]
# Aggregation of a subset returning the average value of 'fihhmngrs_dv' for each 'ZoneID'
avg_fihhmngrs <- sp_8cons[fihhmngrs_dv > 0, .(avg_fihhmngrs = mean(fihhmngrs_dv, na.rm = TRUE)), by = ZoneID]

avg_fihhmngrs
# Assign the result to a new object
lsoa_hardship <- sp_8cons[fihhmngrs_dv > 0, .(N_fihhmngrs_dv = .N), by = c("ZoneID")]
lsoa_hardship # Check the result

Aggregation of another variable ‘employ’

This variable indicates whether an individual is in paid employment (==1) or not (==2).

# basic idea: an aggregation of a subset
sp_8cons[employ == 1, ]
# ... an aggregation of a subset returning the number of observations
sp_8cons[employ == 1, .(employ = .N)]
# ... an aggregation of a subset returning the number of observations for DZs
sp_8cons[employ == 1,  .(employ = .N), by = c("ZoneID") ]
# ... assigned to a new object 
lsoa_hardship <- sp_8cons[employ == 1,  .(employ = .N), by = c("ZoneID") ]
lsoa_hardship # looking good
# Create and display a plot 
lsoa_hardship_plot <- ggplot2::ggplot(lsoa_hardship, aes(x = employ)) + 
  geom_bar(width = 25)
lsoa_hardship_plot
## Warning: `position_stack()` requires non-overlapping x intervals.

2) Prevalence at Local Authority level

# same format as above: "subset, execute/create, by group" with different a "by"
la_hardship_1 <- sp_8cons[employ == 1,  .(employ = .N), by = c("ladnm") ] 
la_hardship_1


How many adults there are within each Local Authority?

la_hardship_2 <- sp_8cons[,  .(N_adults = .N), by = c("ladnm") ] 
la_hardship_2

Combine the two

la_hardship_3 <- merge(la_hardship_1, la_hardship_2, by = c("ladnm"))
la_hardship_3[, prevalence := (employ / N_adults) * 100]

Calculate highest and lowest

la_hardship_3[prevalence == max(prevalence), ]  # where is absolute number highest?
la_hardship_3[prevalence == min(prevalence), ]  # where is absolute number lowest?

plot?

la_hardship_plot <- ggplot(la_hardship_3, aes(x = reorder(ladnm, prevalence), y = prevalence)) +
  geom_bar(stat="identity") +
  coord_flip() +
  labs(x = "local authority",
       y = "prevalence of employment")

la_hardship_plot